import packages
#command:jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import nltk
import string
import re
import numpy as np
import pandas as pd
import pickle
#import lda
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white")
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
#from bokeh.transform import factor_cmap
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger("lda").setLevel(logging.WARNING)
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')
# size of training and dataset
print(train.shape)
print(test.shape)
(1482535, 8) (693359, 7)
# different data types in the dataset: categorical (strings) and numeric
train.dtypes
train_id int64 name object item_condition_id int64 category_name object brand_name object price float64 shipping int64 item_description object dtype: object
train.head()
| train_id | name | item_condition_id | category_name | brand_name | price | shipping | item_description | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | MLB Cincinnati Reds T Shirt Size XL | 3 | Men/Tops/T-shirts | NaN | 10.0 | 1 | No description yet |
| 1 | 1 | Razer BlackWidow Chroma Keyboard | 3 | Electronics/Computers & Tablets/Components & P... | Razer | 52.0 | 0 | This keyboard is in great condition and works ... |
| 2 | 2 | AVA-VIV Blouse | 1 | Women/Tops & Blouses/Blouse | Target | 10.0 | 1 | Adorable top with a hint of lace and a key hol... |
| 3 | 3 | Leather Horse Statues | 1 | Home/Home Décor/Home Décor Accents | NaN | 35.0 | 1 | New with tags. Leather horses. Retail for [rm]... |
| 4 | 4 | 24K GOLD plated rose | 1 | Women/Jewelry/Necklaces | NaN | 44.0 | 0 | Complete with certificate of authenticity |
Process the suggested price we will provide using log transformation.
train.price.describe()
count 1.482535e+06 mean 2.673752e+01 std 3.858607e+01 min 0.000000e+00 25% 1.000000e+01 50% 1.700000e+01 75% 2.900000e+01 max 2.009000e+03 Name: price, dtype: float64
Comparison of the distribution of price attributes before and after transformation.
plt.subplot(1, 2, 1)
(train['price']).plot.hist(bins=50, figsize=(20,10), edgecolor='white',range=[0,250])
plt.xlabel('price+', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Price Distribution - Training Set', fontsize=17)
plt.subplot(1, 2, 2)
np.log(train['price']+1).plot.hist(bins=50, figsize=(20,10), edgecolor='white')
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Log(Price) Distribution - Training Set', fontsize=17)
plt.show()
Shipping cost: Approximately 55% of sellers bear the shipping cost.
train.shipping.value_counts()/len(train)
0 0.552726 1 0.447274 Name: shipping, dtype: float64
Look at the price changes under different shipping situations.
prc_shipBySeller = train.loc[train.shipping==1, 'price']
prc_shipByBuyer = train.loc[train.shipping==0, 'price']
fig, ax = plt.subplots(figsize=(20,10))
ax.hist(np.log(prc_shipBySeller+1), color='#8CB4E1', alpha=1.0, bins=50,
label='Price when Seller pays Shipping')
ax.hist(np.log(prc_shipByBuyer+1), color='#007D00', alpha=0.7, bins=50,
label='Price when Buyer pays Shipping')
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
plt.legend()
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.title('Price Distribution by Shipping Type', fontsize=17)
plt.tick_params(labelsize=15)
plt.show()
It seems that the average price for users who pay for shipping themselves is lower than for sellers who offer free shipping...
Product category
print("There are %d unique values in the category column." % train['category_name'].nunique())
There are 1287 unique values in the category column.
# TOP 5 RAW CATEGORIES
train['category_name'].value_counts()[:5]
Women/Athletic Apparel/Pants, Tights, Leggings 60177 Women/Tops & Blouses/T-Shirts 46380 Beauty/Makeup/Face 34335 Beauty/Makeup/Lips 29910 Electronics/Video Games & Consoles/Games 26557 Name: category_name, dtype: int64
# missing categories
print("There are %d items that do not have a label." % train['category_name'].isnull().sum())
There are 6327 items that do not have a label.
Subdivide the category
def split_cat(text):
try: return text.split("/")
except: return ("No Label", "No Label", "No Label")
train['general_cat'], train['subcat_1'], train['subcat_2'] = \
zip(*train['category_name'].apply(lambda x: split_cat(x)))
train.head()
| train_id | name | item_condition_id | category_name | brand_name | price | shipping | item_description | general_cat | subcat_1 | subcat_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | MLB Cincinnati Reds T Shirt Size XL | 3 | Men/Tops/T-shirts | NaN | 10.0 | 1 | No description yet | Men | Tops | T-shirts |
| 1 | 1 | Razer BlackWidow Chroma Keyboard | 3 | Electronics/Computers & Tablets/Components & P... | Razer | 52.0 | 0 | This keyboard is in great condition and works ... | Electronics | Computers & Tablets | Components & Parts |
| 2 | 2 | AVA-VIV Blouse | 1 | Women/Tops & Blouses/Blouse | Target | 10.0 | 1 | Adorable top with a hint of lace and a key hol... | Women | Tops & Blouses | Blouse |
| 3 | 3 | Leather Horse Statues | 1 | Home/Home Décor/Home Décor Accents | NaN | 35.0 | 1 | New with tags. Leather horses. Retail for [rm]... | Home | Home Décor | Home Décor Accents |
| 4 | 4 | 24K GOLD plated rose | 1 | Women/Jewelry/Necklaces | NaN | 44.0 | 0 | Complete with certificate of authenticity | Women | Jewelry | Necklaces |
# repeat the same step for the test set
test['general_cat'], test['subcat_1'], test['subcat_2'] = \
zip(*test['category_name'].apply(lambda x: split_cat(x)))
print("There are %d unique first sub-categories." % train['subcat_1'].nunique())
There are 114 unique first sub-categories.
print("There are %d unique second sub-categories." % train['subcat_2'].nunique())
There are 871 unique second sub-categories.
Overall, we have 7 main categories (114 in the first subcategory and 871 in the second subcategory): Women and beauty items are the most popular categories (over 50% of observations), followed by children and electronic products.
Distribution of major categories:
x = train['general_cat'].value_counts().index.values.astype('str')
y = train['general_cat'].value_counts().values
pct = [("%.2f"%(v*100))+"%"for v in (y/len(train))]
#import plotly.offline as py
#py.init_notebook_mode(connected=True)
#import plotly.graph_objs as go
#import plotly.tools as tls
trace1 = go.Bar(x=x, y=y, text=pct)
layout = dict(title= 'Number of Items by Main Category',
yaxis = dict(title='Count'),
xaxis = dict(title='Category'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)
Distribution of subcat_1
x = train['subcat_1'].value_counts().index.values.astype('str')[:15]
y = train['subcat_1'].value_counts().values[:15]
pct = [("%.2f"%(v*100))+"%"for v in (y/len(train))][:15]
trace1 = go.Bar(x=x, y=y, text=pct,
marker=dict(
color = y,colorscale='Portland',showscale=True,
reversescale = False
))
layout = dict(title= 'Number of Items by Sub Category (Top 15)',
yaxis = dict(title='Count'),
xaxis = dict(title='SubCategory'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)
general_cats = train['general_cat'].unique()
x = [train.loc[train['general_cat']==cat, 'price'] for cat in general_cats]
data = [go.Box(x=np.log(x[i]+1), name=general_cats[i]) for i in range(len(general_cats))]
layout = dict(title="Price Distribution by General Category",
yaxis = dict(title='Frequency'),
xaxis = dict(title='Category'))
fig = dict(data=data, layout=layout)
py.iplot(fig)